
# coding: utf-8

# In[1]:

import pandas as pd
import time
import gc
from datetime import datetime
from collections import Counter


# In[3]:

# ---------------------
# IMPORT FILES
# Both inputs are read as '*'-delimited text decoded as ISO-8859-1.
INPT = pd.read_csv(
    'USPTO_KR_assg.csv',
    sep='*',
    encoding="ISO-8859-1",
)
TRGT = pd.read_csv(
    'dg_namstand_all.csv',
    sep='*',
    encoding="ISO-8859-1",
)

# Quick sanity check: column names of each file, then the two row counts.
print(INPT.columns.tolist())
print(TRGT.columns.tolist())
print('INTP --', len(INPT), ' ', 'TRGT --', len(TRGT))


# In[4]:

# ---------------------
# Set variable names
# Bring both tables to the same layout: standard_name, stem, key, nb.
#   - keep only the name columns and the identifier column, de-duplicated
#   - rename the stemmed name to 'stem' and the identifier to 'key'
#   - replace missing values with the placeholder string 'NoName'
#   - 'nb' is the row number after the index reset
INPT = (
    INPT[['standard_name', 'stem_name', 'assgid']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'stem_name': 'stem', 'assgid': 'key'})
    .fillna('NoName')
)
INPT['nb'] = INPT.index

TRGT = (
    TRGT[['standard_name', 'stem_name', 'Symbol']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'stem_name': 'stem', 'Symbol': 'key'})
    .fillna('NoName')
)
TRGT['nb'] = TRGT.index

print(INPT.columns.tolist())
print(TRGT.columns.tolist())
print('INTP --', len(INPT), ' ', 'TRGT --', len(TRGT))

# merge by assg_id and gvkey to wku after matching <assg_id : gvkey>


# In[40]:

# ---------------------
# DEFAULT MATCH PARAMETERS
threshold = 110 # accept a candidate when its score is higher than this
                # note: score is sum of wts of shared words, wt = 100/word_frequency
lowhold = 45    # otherwise accept when score is above this AND relative score >= 100
                # relative score is 100*score / score of the candidate's whole source stem
                # (the matching loop divides by the value held in srcscores_dict)
maxno = 5       # don't report if there are more than this no. above threshold AND no complete matches
                # NOTE(review): not referenced by any code visible in this file
# ---------------------

# main global variables
# The commented-out lines are the alternatives for matching in the opposite
# direction (TRGT as the source side, INPT as the target side).
srcnames = INPT[['stem','nb']]
#srcnames = TRGT[['stem','nb']]
srcnames = {k: g['stem'].tolist() for k,g in srcnames.groupby("nb")} # nb -> [stem]

srcfull = INPT[['standard_name','nb']]
#srcfull = TRGT[['standard_name','nb']]
srcfull = {k: g['nb'].tolist() for k,g in srcfull.groupby('standard_name')} # standard_name -> [nb, ...]

srckey =INPT[['key','nb']] # key might be gvkey or assg_nb
#srckey = TRGT[['key','nb']]
trgkey = TRGT[['key','nb']]
#trgkey = INPT[['key','nb']]

# Explicit copy: a 'srcscore' column is added to this frame later, and assigning
# into a bare column-subset of INPT triggers pandas' SettingWithCopyWarning.
srcscores = INPT[['stem','nb']].copy()
#srcscores = TRGT[['stem','nb']].copy()
print('srcscores --',len(srcscores))
#srcscores = TRGT[['stem','nb']]; print('srcscores --',len(srcscores))


# In[41]:

#-----------------------------------------------
# WORD TOKEN DICTIONARY
# Collect every word token of every source stem into one flat list
# (duplicates are kept on purpose: the next step counts them).

# process one line at a time
start_time = time.time()
wcounts=[] # container for word tokens appearing in the names

INPT_stem = INPT['stem'].tolist()
#TRGT_stem = TRGT['stem'].tolist()

for stem in INPT_stem:
#for stem in TRGT_stem:
    if stem == 'NoName': continue  # placeholder for a missing name, carries no tokens
    words = stem.split() # parse name into array of words - specify delimeter if you want (e.g. stem.split(","))
    # Append in place. The previous `wcounts = wcounts + words` rebuilt the
    # whole list on every iteration, which made this loop quadratic.
    wcounts.extend(words)

print('Word Token Dictionary Done --- %s min ---' % ((time.time() - start_time)/60))


# In[42]:

#-----------------------------------------------
# WORD TOKEN FREQUENCY DICTIONARY
# Count each token, then overwrite the count with its weight 100/count, so a
# token that occurs less often carries a larger weight. `counts` stays a
# Counter, so looking up a token that was never seen yields 0.

start_time = time.time()
counts = Counter(wcounts)
for token, occurrences in list(counts.items()):
    counts[token] = 100 / occurrences

print('Word Token Frequency Dictionary Done --- %s min ---' % ((time.time() - start_time)/60)) # Less than 1 min


# In[43]:

# Spot check: weight of a single token.
print(counts['ALZA'])


# In[44]:

#-----------------------------------------------        
# SRCSCORES -- FOR THOSE UNMATCHED IN STEP 1

def calscore(data):
    """Return the total weight of the words in a stem.

    The stem is split on whitespace and the weight of every word is looked up
    in the module-level ``counts`` mapping and summed. A word that occurs
    several times in the stem is counted each time.

    Parameters
    ----------
    data : str
        Stem to score.

    Returns
    -------
    number or str
        Sum of the word weights (0 when the stem has no words), or the string
        ``'error'`` when ``data`` cannot be split or a weight lookup fails.
    """
    try:
        return sum(counts[word] for word in data.split())
    # Only the failures a bad value can cause are turned into the sentinel.
    # A bare `except:` would also swallow KeyboardInterrupt / SystemExit and
    # hide programming errors such as an undefined `counts`.
    except (AttributeError, TypeError, KeyError):
        return 'error'

start_time = time.time()
# Drop the placeholder rows first so no score is computed for them, then add
# the score column with .assign(), which returns a new frame instead of
# writing into a slice of INPT (the cause of SettingWithCopyWarning).
srcscores = srcscores[srcscores['stem'] != 'NoName']
srcscores = srcscores.assign(srcscore=srcscores['stem'].apply(calscore))

# nb -> [score of that source stem]; the matching loop reads element [0].
srcscores_dict = {nb: grp.tolist() for nb, grp in srcscores.groupby('nb')['srcscore']}

print('SRCSCORES Done --- %s min ---' % ((time.time() - start_time)/60)) # Less than 1 min
print(len(srcscores))


# In[45]:

# Spot check: first rows of the scored frame and the score stored for nb == 1.
print(srcscores[:5])
print(srcscores_dict[1][0])


# In[46]:

#-----------------------------------------------
# WORDID DICTIONARY -- FOR THOSE UNMATCHED IN STEP 1
# Inverted index: word token -> list of source row numbers (nb) whose stem
# contains that token. Every word of the stem is indexed, and a word repeated
# inside one stem adds the same nb once per occurrence.

wordid = {}
start_time = time.time()
# Walk the two columns directly: iterrows() would build a Series per row just
# to read these two values.
for stem, nb in zip(srcscores['stem'], srcscores['nb']):
    if stem == 'NoName': continue
    nb = int(nb)

    # parse name into array of words - specify delimeter if you want (e.g. stem.split(","))
    for word in stem.split():
        wordid.setdefault(word, []).append(nb)

print('WORDID Done --- %s min ---' % ((time.time() - start_time)/60)) # Less than 1 min


# In[47]:

# Spot check: source rows indexed under a single token.
print(wordid['CJ'])


# In[48]:

# ---------------------------------------------------------------
# MATCHING WITH TRGT
# For every target row:
#   1. add up, per source row, the weights of the target's words that the
#      source stem also contains (via the wordid index);
#   2. if the target's standard_name is identical to a source standard_name,
#      record those sources as perfect matches;
#   3. otherwise record every source whose score passes the thresholds.

# one target at a time
start_time = time.time()
perfect=[]; over_threshold=[]

# itertuples() yields plain tuples and is much cheaper per row than iterrows()
for line in TRGT.itertuples():
#for line in INPT.itertuples():
    j = line.Index
    trg_nb = line.nb
    stdname = line.standard_name
    stem = line.stem

    # calculate match score of every source row sharing at least one word
    score = {}
    for tword in stem.split():
        if tword not in wordid:
            continue
        wt = counts[tword]            # weight of the shared word
        for nb in wordid[tword]:      # every source row using that word
            score[nb] = score.get(nb, 0) + wt

    # first, check for exact matches on the standard name.
    # 'NoName' is only the fill-in for a missing name, so two rows that both
    # lack a name must not be reported as a perfect match of each other.
    exact_ids = srcfull.get(stdname) if stdname != 'NoName' else None
    if exact_ids:
        for src_nb in exact_ids:
            perfect.append({'src_id':src_nb,'src_name':stdname,'trg_id':trg_nb,'trg_name':stdname})

    else: # in case of no perfect match
        for src_nb, value in score.items():
            # share of the source stem's own total weight that was matched
            relscore = 100*value/srcscores_dict[src_nb][0]
            if value > threshold or (relscore >= 100 and value > lowhold):
                over_threshold.append({'src_id':src_nb,'src_stem':srcnames[src_nb][0],'trg_id':trg_nb,'trg_stem':stem,'sc':value,'rsc':relscore})

    if j % 10000 == 0: print(j,'--- %s min ---' % ((time.time() - start_time)/60))

print('MATCHING Done --- %s min ---' % ((time.time() - start_time)/60))

perfect = pd.DataFrame(perfect)
over_threshold = pd.DataFrame(over_threshold)


# In[ ]:

### SAVE (ASSIGNEE TO COMPUSTAT) ###


# In[49]:

# Match Assignee to Compustat
# Replace the internal row numbers by the real identifiers: src_id is looked
# up in srckey (labelled assgid), then trg_id in trgkey (labelled Symbol).
# The helper column 'nb' is dropped after each lookup.
perfect = (
    perfect
    .merge(srckey, left_on='src_id', right_on='nb', how='inner')
    [['key', 'src_name', 'trg_id', 'trg_name']]
    .rename(columns={'key': 'assgid'})
    .merge(trgkey, left_on='trg_id', right_on='nb', how='inner')
    [['assgid', 'src_name', 'key', 'trg_name']]
    .rename(columns={'key': 'Symbol'})
)
print(perfect[:10])
print(len(perfect))


# In[50]:

# Match Assignee to Compustat
# Same identifier lookup as for the perfect matches, keeping the two score
# columns: src_id -> assgid via srckey, then trg_id -> Symbol via trgkey.
over_threshold = (
    over_threshold
    .merge(srckey, left_on='src_id', right_on='nb', how='inner')
    [['key', 'src_stem', 'trg_id', 'trg_stem', 'sc', 'rsc']]
    .rename(columns={'key': 'assgid'})
    .merge(trgkey, left_on='trg_id', right_on='nb', how='inner')
    [['assgid', 'src_stem', 'key', 'trg_stem', 'sc', 'rsc']]
    .rename(columns={'key': 'Symbol'})
)
print(over_threshold[:10])
print(len(over_threshold))


# In[51]:

# Match Assignee to Compustat
# Write both result tables as '*'-delimited files (row index included).
for frame, path in (
    (perfect, 'matched_uspto_perfect_k2d.csv'),
    (over_threshold, 'matched_uspto_scorebased_k2d.csv'),
):
    frame.to_csv(path, sep='*')


# In[26]:

### SAVE (COMPUSTAT TO ASSIGNEE) ###


# In[35]:

# Match Compustat to Assignee
# Counterpart of the "Assignee to Compustat" cells for the opposite matching
# direction: here the source key is labelled Symbol and the target key assgid
# (presumably for a run with the commented-out TRGT-as-source lines enabled).
#
# These cells need the raw match tables, i.e. with the src_id / trg_id columns.
# When the "Assignee to Compustat" cells have already run, those columns are
# gone, and the merges below used to fail with a KeyError on a top-to-bottom
# run. The readiness flag makes the three cells skip in that case instead.
_raw_match_cols = {'src_id', 'trg_id'}
d2k_ready = (_raw_match_cols.issubset(perfect.columns)
             and _raw_match_cols.issubset(over_threshold.columns))

if d2k_ready:
    perfect = pd.merge(perfect, srckey, left_on='src_id',right_on='nb',how='inner')
    perfect = perfect[['key','src_name','trg_id','trg_name']]
    perfect = perfect.rename(columns = {'key':'Symbol'})
    perfect = pd.merge(perfect, trgkey, left_on='trg_id',right_on='nb',how='inner')
    perfect = perfect[['Symbol','src_name','key','trg_name']]; perfect = perfect.rename(columns = {'key':'assgid'})
    print(perfect[:10]); print(len(perfect))
else:
    print('Compustat-to-Assignee step skipped: match tables no longer hold raw src_id/trg_id columns')


# In[38]:

# Match Compustat to Assignee
if d2k_ready:
    over_threshold = pd.merge(over_threshold, srckey, left_on='src_id',right_on='nb',how='inner')
    over_threshold = over_threshold[['key','src_stem','trg_id','trg_stem','sc','rsc']]
    over_threshold = over_threshold.rename(columns = {'key':'Symbol'})
    over_threshold = pd.merge(over_threshold, trgkey, left_on='trg_id',right_on='nb',how='inner')
    over_threshold = over_threshold[['Symbol','src_stem','key','trg_stem','sc','rsc']]; over_threshold = over_threshold.rename(columns = {'key':'assgid'})
    print(over_threshold[:10]); print(len(over_threshold))


# In[39]:

# Only write the d2k files when the tables were actually built for that
# direction; otherwise they would receive the other direction's data.
if d2k_ready:
    perfect.to_csv('matched_uspto_perfect_d2k.csv',sep='*')
    over_threshold.to_csv('matched_uspto_scorebased_d2k.csv',sep='*')


# In[ ]:



